# Load necessary libraries
library(haven)
library(dplyr)
library(readstata13)
library(stringr)
library(margins)
library(readxl)
library(readr)
library(dplyr)
library(patchwork)
library(broom)
library(ggplot2)

# Load the data
survey_data <- read.dta13("W5_Malaysia_coreQmerged_20210819_release.dta")

filtered_data <- survey_data %>%
  filter(!Q34 %in% c("Missing", "Decline to answer"))

filtered_data <- filtered_data %>%
  filter(!Region %in% c("Borneo Malaysia"))

# Define PH parties
filtered_data$ph_vote <- ifelse(filtered_data$Q34 %in% 
                                  c("Pakatan Harapan (PH)",
                                    "Democratic Action Party (DAP)", 
                                    "Parti Keadilan Rakyat (PKR)",
                                    "Parti Amanah Negara (AMANAH)",
                                    "Parti Pribumi Bersatu Malaysia (BERSATU)"),
                                1, 0)
filtered_data$bn_vote <- ifelse(filtered_data$Q34 %in%
                                  c("Barisan Nasional (BN)"),1,0)

df_demo <- readxl::read_excel("MALAYSIA_2018_PARLIAMENT_RESULTS.xlsx")
df_comp <- read_csv("MALAYSIA_2018_PARLIAMENT_COMPOSITION.csv")

# Clean constituency code in both datasets
df_demo <- df_demo %>%
  mutate(KODPAR_clean = gsub("\\s+", "", KODPAR))

df_comp <- df_comp %>%
  mutate(KODPAR_clean = gsub("\\s+", "", `PARLIAMENTARY CONSTITUENCY CODE`))

# Perform a left join
df_joined <- df_demo %>%
  left_join(df_comp, by = "KODPAR_clean")

df_joined <- df_joined %>%
  filter(!STATE.x %in% c("WILAYAH PERSEKUTUAN LABUAN", "SABAH", "SARAWAK")) %>%
  filter(!KODPAR %in% c("P.115"))


# Define a named vector for mapping states to regions
state_to_region <- c(
  "PERLIS" = "Northern",
  "KEDAH" = "Northern",
  "PULAU PINANG" = "Northern",
  "PENANG" = "Northern",  # for alternate labeling
  "PERAK" = "Northern",
  "SELANGOR" = "Central",
  "WILAYAH PERSEKUTUAN KUALA LUMPUR" = "Central",
  "WILAYAH PERSEKUTUAN PUTRAJAYA" = "Central",
  "NEGERI SEMBILAN" = "Central",
  "MELAKA" = "Southern",
  "JOHOR" = "Southern",
  "PAHANG" = "Eastern",
  "KELANTAN" = "Eastern",
  "TERENGGANU" = "Eastern"
)

# Apply mapping to results data
df_joined <- df_joined %>%
  mutate(
    Region = state_to_region[STATE.x],
    UrbanRural = toupper(`URBAN-RURAL CLASSIFICATION (2018)`),
    Malay = `MALAY (%)` / 100,
    Chinese = `CHINESE (%)` / 100,
    Indian = `INDIANS (%)` / 100,
    Other = pmax(0, 1 - (Malay + Chinese + Indian))
  )

# Define IR13 to Urban.Rural mapping
# Clean IR13 values before mapping
filtered_data$IR13_clean <- trimws(as.character(filtered_data$IR13))

# Define the mapping again
ir13_to_urban_rural <- c(
  "Capital or Megacity (1 million population plus)" = "URBAN",
  "Regional center or Other major cities (100,000 plus)" = "URBAN",
  "Small city or town (less than 100,000 people)" = "SEMI URBAN",
  "Village or countryside" = "RURAL"
)

# Apply the mapping to a new column
filtered_data$UrbanRural <- ir13_to_urban_rural[filtered_data$IR13_clean]

filtered_data$Se11a_clean <- trimws(as.character(filtered_data$Se11a))

filtered_data <- filtered_data %>%
  mutate(Region = str_replace(Region, "_\\(.*\\)", ""))

filtered_data$EthnicGroup <- case_when(
  grepl("chinese", filtered_data$Se11a_clean, ignore.case = TRUE) ~ "Chinese",
  grepl("malay", filtered_data$Se11a_clean, ignore.case = TRUE) ~ "Malay",
  grepl("indian", filtered_data$Se11a_clean, ignore.case = TRUE) ~ "Indian",
  TRUE ~ "Other"
)

filtered_data <- filtered_data %>%
  mutate(Malay_NonMalay = ifelse(EthnicGroup 
                                 %in% c("Chinese", "Indian"), "Non-Malay", "Malay"))


# Table 7
model_basic <- glm(bn_vote ~ UrbanRural * Malay_NonMalay, 
                   data = filtered_data, family = binomial)

# Figure 9

newdata <- expand.grid(
  UrbanRural = c("RURAL", "SEMI URBAN", "URBAN"),
  Malay_NonMalay = c("Malay", "Non-Malay")
)

pred <- predict(model_basic, newdata, type = "link", se.fit = TRUE)
newdata$prob <- plogis(pred$fit)
newdata$prob_lower <- plogis(pred$fit - 1.96 * pred$se.fit)
newdata$prob_upper <- plogis(pred$fit + 1.96 * pred$se.fit)

# Odds Ratio
or_table <- tidy(model_basic, exponentiate = TRUE, conf.int = TRUE)

# Filter and label odds ratios of interest
or_df <- or_table %>%
  filter(term %in% c(
    "UrbanRuralSEMI URBAN",
    "UrbanRuralURBAN",
    "Malay_NonMalayNon-Malay",
    "UrbanRuralURBAN:Malay_NonMalayNon-Malay",
  )) %>%
  mutate(
    Comparison = case_when(
      term == "UrbanRuralSEMI URBAN" ~ "SemiUrban vs Rural (Malay)",
      term == "UrbanRuralURBAN" ~ "Urban vs Rural (Malay)",
      term == "UrbanRuralSEMI URBAN:Malay_NonMalayNon-Malay" ~ "SemiUrban vs Rural (Non-Malay)",
      term == "UrbanRuralURBAN:Malay_NonMalayNon-Malay" ~ "Urban vs Rural (Non-Malay)"
    )
  )


ggplot(or_df, aes(x = Comparison, y = estimate)) +
  geom_point(size = 3, color = "steelblue") +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2, color = "steelblue") +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  labs(
    title = "Odds Ratios: Urban/SemiUrban vs Rural",
    x = "Comparison",
    y = "Odds Ratio (with 95% CI)"
  ) +
  theme_minimal() +
  coord_flip()

# Multilevel logistic model of BN vote
model_mrp <- glmer(
  bn_vote ~ UrbanRural + (1 + UrbanRural | EthnicGroup) + (1 | Region),
  data = filtered_data,
  family = binomial
)

ideal_types_mrp <- expand.grid(
  EthnicGroup = unique(filtered_data$EthnicGroup),
  UrbanRural = unique(filtered_data$UrbanRural),
  Region = unique(filtered_data$Region)
)

ideal_types_mrp$bn_prob <- predict(model_mrp, newdata = ideal_types_mrp, 
                                   type = "response", allow.new.levels = TRUE)

df_joined$bn_est_mrp <- NA

for (i in 1:nrow(df_joined)) {
  d <- df_joined[i, ]
  
  bn_prob_malay <- ideal_types_mrp$bn_prob[ideal_types_mrp$EthnicGroup == "Malay" & 
                                             ideal_types_mrp$UrbanRural == d$UrbanRural & 
                                             ideal_types_mrp$Region == d$Region]
  bn_prob_chinese <- ideal_types_mrp$bn_prob[ideal_types_mrp$EthnicGroup == "Chinese" & 
                                               ideal_types_mrp$UrbanRural == d$UrbanRural & 
                                               ideal_types_mrp$Region == d$Region]
  bn_prob_indian <- ideal_types_mrp$bn_prob[ideal_types_mrp$EthnicGroup == "Indian" & 
                                              ideal_types_mrp$UrbanRural == d$UrbanRural & 
                                              ideal_types_mrp$Region == d$Region]
  bn_prob_other <- ideal_types_mrp$bn_prob[ideal_types_mrp$EthnicGroup == "Other" & 
                                             ideal_types_mrp$UrbanRural == d$UrbanRural & 
                                             ideal_types_mrp$Region == d$Region]
  
  df_joined$bn_est_mrp[i] <- sum(
    bn_prob_malay * d$Malay,
    bn_prob_chinese * d$Chinese,
    bn_prob_indian * d$Indian,
    bn_prob_other * d$Other,
    na.rm = TRUE
  )
}

mae_mrp <- mean(abs(df_joined$bn_est_mrp - df_joined$`BN.Share`), na.rm = TRUE)
cor_mrp <- cor(df_joined$bn_est_mrp, df_joined$`BN.Share`, use = "complete.obs")

cat("MRP Model MAE:", mae_mrp, "\n")
cat("Correlation with actual BN vote:", cor_mrp, "\n")

ggplot(df_joined, aes(x = `BN.Share`, y = bn_est_mrp)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "purple") +
  labs(#title = "BN: MRP Estimate vs Actual",
       x = "Actual BN Vote Share", y = "Predicted BN (MRP)")
